# Load the diamonds data and encode the three quality grades as ordered factors
# (worst -> best) so models treat them as ordinal variables.
# read.csv() already returns a data.frame, so no extra data.frame() wrapper is needed.
diamonds <- read.csv("diamonds.csv")
# Use ordered = TRUE explicitly (the original `order = T` only worked through
# partial argument matching, and T can be reassigned).
diamonds$color <- factor(diamonds$color, ordered = TRUE,
                         levels = c('J', 'I', 'H', 'G', 'F', 'E', 'D'))
diamonds$clarity <- factor(diamonds$clarity, ordered = TRUE,
                           levels = c('I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'))
diamonds$cut <- factor(diamonds$cut, ordered = TRUE,
                       levels = c('Fair', 'Good', 'Very Good', 'Premium', 'Ideal'))
# The first CSV column is a row index; give it a proper name.
colnames(diamonds)[1] <- "ID"
# Renamed from `sum`, which masked base::sum().
diamonds_summary <- xkablesummary(diamonds, title = "Summary of Diamonds")
diamonds_summary
| | ID | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. : 1 | Min. :0.20 | Fair : 1610 | J: 2808 | SI1 :13065 | Min. :43.0 | Min. :43.0 | Min. : 326 | Min. : 0.00 | Min. : 0.0 | Min. : 0.0 |
| Q1 | 1st Qu.:13486 | 1st Qu.:0.40 | Good : 4906 | I: 5422 | VS2 :12258 | 1st Qu.:61.0 | 1st Qu.:56.0 | 1st Qu.: 950 | 1st Qu.: 4.71 | 1st Qu.: 4.7 | 1st Qu.: 2.9 |
| Median | Median :26970 | Median :0.70 | Very Good:12082 | H: 8304 | SI2 : 9194 | Median :61.8 | Median :57.0 | Median : 2401 | Median : 5.70 | Median : 5.7 | Median : 3.5 |
| Mean | Mean :26970 | Mean :0.80 | Premium :13791 | G:11292 | VS1 : 8171 | Mean :61.7 | Mean :57.5 | Mean : 3933 | Mean : 5.73 | Mean : 5.7 | Mean : 3.5 |
| Q3 | 3rd Qu.:40455 | 3rd Qu.:1.04 | Ideal :21551 | F: 9542 | VVS2 : 5066 | 3rd Qu.:62.5 | 3rd Qu.:59.0 | 3rd Qu.: 5324 | 3rd Qu.: 6.54 | 3rd Qu.: 6.5 | 3rd Qu.: 4.0 |
| Max | Max. :53940 | Max. :5.01 | NA | E: 9797 | VVS1 : 3655 | Max. :79.0 | Max. :95.0 | Max. :18823 | Max. :10.74 | Max. :58.9 | Max. :31.8 |
| NA | NA | NA | NA | D: 6775 | (Other): 2531 | NA | NA | NA | NA | NA | NA |
# Frequency of each color grade, ordered worst (J) to best (D).
color_freq <- table(diamonds$color)
barplot(color_freq,
        col = c(2, 3, 4, 7, 6, 8, 9),
        main = 'Barplot of Color',
        xlab = "Color",
        ylab = "Frequency")
color plot
# Frequency of each clarity grade, ordered worst (I1) to best (IF).
clarity_freq <- table(diamonds$clarity)
barplot(clarity_freq,
        col = c(2, 3, 4, 7, 6, 8, 9, 5),
        main = 'Barplot of Clarity',
        xlab = "Clarity",
        ylab = "Frequency")
clarity plot
# Frequency of each cut grade, ordered worst (Fair) to best (Ideal).
barplot(table(diamonds$cut), col = c(6, 7, 8, 3, 4),
        main = 'Barplot of Cut', xlab = "Cut", ylab = "Frequency")
# Count diamonds per cut grade, then convert counts to proportions for the pie labels.
cut_count <- dplyr::count(diamonds, cut)
colnames(cut_count)[colnames(cut_count) == 'cut'] <- 'Cut'
cut_count$n <- cut_count$n / sum(cut_count$n)  # now a proportion, not a count
loadPkg("ggplot2")
# A pie chart is a single stacked bar mapped into polar coordinates.
# Dropped the no-op paste0(round(n, 2), "") — round() alone gives the same label.
ggplot(cut_count, aes(x = "", y = n, fill = Cut)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(size = 4, aes(label = round(n, 2)),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c(6, 7, 8, 3, 4)) +
  labs(title = 'Pie Chart of Cut')
cut plot
In this section we will look at all the continuous variables besides price.
# Drop rows with physically impossible zero dimensions and the extreme
# y/z outliers above 30 mm (visible as Max y = 58.9, z = 31.8 in the summary).
diaNO <- subset(diamonds, x > 0 & y > 0 & z > 0 & y < 30 & z < 30)
# y vs x, with z encoded as a diverging color gradient centered at 5.
ggplot(diaNO, aes(x, y, color = z)) +
  geom_point(size = 8, alpha = 0.6) +
  ggtitle('Y versus X versus Z for Diamonds (Outliers Removed)') +
  scale_color_gradient2(midpoint = 5, low = "blue", mid = "green", high = "red")
x vs y vs z
# Histogram of the table variable, flipped so bars run horizontally.
# The two separate labs() calls are merged into one.
ggplot(diamonds, aes(y = table)) +
  geom_histogram(col = "black",
                 fill = "green",
                 alpha = .8,
                 binwidth = 1.5) +
  coord_flip() +
  labs(title = 'Histogram of Table', y = "Table", x = "Frequency")
table plot
# Histogram of the depth variable, flipped horizontally; merged the two labs() calls.
ggplot(diamonds, aes(y = depth)) +
  geom_histogram(col = "black",
                 fill = "red",
                 alpha = .8,
                 binwidth = 1) +
  coord_flip() +
  labs(title = 'Histogram of Depth', y = "Depth", x = "Frequency")
depth plot
# Boxplot of carat. The original added a plain geom_boxplot() layer that was
# immediately overdrawn by the styled one; the redundant layer is removed.
ggplot(diamonds, aes(y = carat)) +
  geom_boxplot(colour = "black", fill = "orange", outlier.colour = "orange",
               outlier.size = 5, alpha = 0.8) +
  labs(title = 'Boxplot of Carat', y = "Carat")
carat plot
# Boxplot of price (redundant underlying geom_boxplot() layer removed).
ggplot(diamonds, aes(y = price)) +
  geom_boxplot(colour = "black", fill = "blue", outlier.colour = "blue",
               outlier.size = 5, alpha = 0.8) +
  labs(title = 'Boxplot of Price')
# Histogram of price, flipped horizontally; the two labs() calls are merged.
ggplot(diamonds, aes(y = price)) +
  geom_histogram(col = "black",
                 fill = "blue",
                 alpha = .8,
                 binwidth = 300) +
  coord_flip() +
  labs(title = 'Histogram of Price', y = "Price", x = "Frequency")
# Normal Q-Q plot: strong curvature indicates price is far from normal.
qqnorm(diamonds$price, main = "Q-Q plot of Price", col = 'blue', ylab = 'Price')
qqline(diamonds$price)
price plot
One simple linear model for each individual predictor of price.
# One simple linear regression of price on each numeric predictor.
modelcarat <- lm(price ~ carat, data = diamonds)
modeldepth <- lm(price ~ depth, data = diamonds)
modeltable <- lm(price ~ table, data = diamonds)
modelx <- lm(price ~ x, data = diamonds)
modely <- lm(price ~ y, data = diamonds)
modelz <- lm(price ~ z, data = diamonds)
# sjPlot / sjmisc / sjlabelled supply tab_model() regression tables.
loadPkg("sjPlot")
loadPkg("sjmisc")
loadPkg("sjlabelled")
tab_model(modelcarat)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2256.36 | -2281.95 – -2230.77 | <0.001 |
| carat | 7756.43 | 7728.86 – 7784.00 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.849 / 0.849 | ||
# Regression table for the depth-only model (R^2 = 0.000 below: depth alone explains nothing).
tab_model(modeldepth)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 5763.67 | 4312.17 – 7215.16 | <0.001 |
| depth | -29.65 | -53.15 – -6.15 | 0.013 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.000 / 0.000 | ||
# Regression table for the table-only model (R^2 = 0.016 below: very weak).
tab_model(modeltable)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -9109.05 | -9968.41 – -8249.68 | <0.001 |
| table | 226.98 | 212.04 – 241.93 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.016 / 0.016 | ||
# Regression table for the x-only model (R^2 = 0.782 below).
tab_model(modelx)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -14094.06 | -14175.85 – -14012.26 | <0.001 |
| x | 3145.41 | 3131.41 – 3159.42 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.782 / 0.782 | ||
# Regression table for the y-only model (R^2 = 0.749 below).
tab_model(modely)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -13402.03 | -13488.39 – -13315.66 | <0.001 |
| y | 3022.89 | 3008.12 – 3037.66 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.749 / 0.749 | ||
# Regression table for the z-only model (R^2 = 0.742 below).
tab_model(modelz)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -13296.57 | -13384.05 – -13209.08 | <0.001 |
| z | 4868.79 | 4844.55 – 4893.04 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.742 / 0.742 | ||
ANOVA of models
# NOTE(review): these four single-predictor models are not nested in one
# another (each has the same residual df), so anova() cannot compute F tests —
# hence the Df = 0 and NA F/p values in the table below. An AIC/BIC comparison
# would be more appropriate; left unchanged to preserve the report output.
anovaRes = anova(modelcarat,modelx,modely,modelz)
xkabledply(anovaRes, title = "ANOVA comparison between the models")
| Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|
| 53938 | 1.29e+11 | NA | NA | NA | NA |
| 53938 | 1.87e+11 | 0 | -5.76e+10 | NA | NA |
| 53938 | 2.16e+11 | 0 | -2.86e+10 | NA | NA |
| 53938 | 2.22e+11 | 0 | -6.18e+09 | NA | NA |
loadPkg("modelr")
# Attach fitted values from the carat-only model as a `pred` column.
df.with.predfin.from.model <- add_predictions(diamonds, modelcarat)
head(df.with.predfin.from.model)
loadPkg("ggplot2")
# Observed price against predicted price; the red line traces pred against
# itself, i.e. a reference line through the predictions.
ggplot(df.with.predfin.from.model, aes(price, pred)) +
  geom_point(aes(price, pred), alpha = 0.2) +
  geom_line(aes(pred), colour = "red", size = 1) +
  labs(title = "Carat Linear Model to Predict Price",
       x = "Price", y = "Prediction") +
  xlim(c(0, 20000))
loadPkg("car")
# Added-variable plot for the carat-only model.
avPlots(modelcarat)
Carat is the best singular model
# Correlation structure among the numeric columns (uses <- instead of =).
diamondsnum <- subset(diamonds, select = c("price", "carat", "depth", "table", "x", "y", "z"))
diamondscor <- cor(diamondsnum)
loadPkg("corrplot")
# Mixed display: numbers in one triangle, circles in the other.
corrplot.mixed(diamondscor)
high correlation between many variables
#loadPkg("lattice")
#pairs(diamondsnum[1:4])
Removed the pairs plot for now because it was not used in the presentation and takes too long to run.
# Two-predictor model: carat and x are highly collinear (VIF = 20.3 below).
modelcaratx <- lm(price ~ carat+x,data=diamonds)
tab_model(modelcaratx)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 1737.95 | 1534.85 – 1941.05 | <0.001 |
| carat | 10125.99 | 10003.38 – 10248.59 | <0.001 |
| x | -1026.86 | -1078.67 – -975.05 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.853 / 0.853 | ||
# Variance inflation factors for carat + x.
xkablevif(modelcaratx)
| carat | x |
|---|---|
| 20.3 | 20.3 |
# Carat + table: low collinearity (VIF = 1.03 below) but barely better R^2.
modelcarattable <- lm(price ~ carat+table,data=diamonds)
tab_model(modelcarattable)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 1961.99 | 1625.24 – 2298.74 | <0.001 |
| carat | 7820.04 | 7792.16 – 7847.92 | <0.001 |
| table | -74.30 | -80.22 – -68.39 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.851 / 0.851 | ||
# Variance inflation factors for carat + table.
xkablevif(modelcarattable)
| carat | table |
|---|---|
| 1.03 | 1.03 |
# Depth + x: no collinearity (VIF = 1 below), but R^2 no better than x alone.
modeldepthx <- lm(price ~ depth+x,data=diamonds)
tab_model(modeldepthx)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -16116.57 | -16800.73 – -15432.42 | <0.001 |
| depth | 32.66 | 21.69 – 43.62 | <0.001 |
| x | 3146.47 | 3132.46 – 3160.47 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.782 / 0.782 | ||
# Variance inflation factors for depth + x.
xkablevif(modeldepthx)
| depth | x |
|---|---|
| 1 | 1 |
Every multinumeric model is either a worse predictor, or has higher vif values
There is no way to fix this from looking online
trying out different models
# Carat plus the ordered factor cut (fitted as polynomial contrasts).
modelcaratcut <- lm(price ~ carat+cut, data = diamonds)
tab_model(modelcaratcut)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2701.38 | -2731.62 – -2671.13 | <0.001 |
| carat | 7871.08 | 7843.68 – 7898.48 | <0.001 |
| cut [linear] | 1239.80 | 1188.64 – 1290.96 | <0.001 |
| cut [quadratic] | -528.60 | -573.94 – -483.26 | <0.001 |
| cut [cubic] | 367.91 | 328.29 – 407.53 | <0.001 |
| cut [4th degree] | 74.59 | 42.76 – 106.42 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.856 / 0.856 | ||
# Variance inflation factors for carat + cut contrasts.
xkablevif(modelcaratcut)
| carat | cut.C | cut.L | cut.Q | cut^4 |
|---|---|---|---|---|
| 1.04 | 2 | 2.67 | 1.75 | 1.24 |
# Carat plus the ordered factor clarity (R^2 = 0.895 below, best pair so far).
modelcaratclarity <- lm(price ~ carat+clarity, data = diamonds)
tab_model(modelcaratclarity)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2977.27 | -3002.97 – -2951.57 | <0.001 |
| carat | 8440.06 | 8415.26 – 8464.85 | <0.001 |
| clarity [linear] | 4216.78 | 4150.81 – 4282.74 | <0.001 |
| clarity [quadratic] | -1931.41 | -1993.87 – -1868.94 | <0.001 |
| clarity [cubic] | 1005.85 | 952.16 – 1059.54 | <0.001 |
| clarity [4th degree] | -480.18 | -523.19 – -437.17 | <0.001 |
| clarity [5th degree] | 283.94 | 248.71 – 319.18 | <0.001 |
| clarity [6th degree] | 12.66 | -18.07 – 43.40 | 0.419 |
| clarity [7th degree] | 198.05 | 170.97 – 225.13 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.895 / 0.895 | ||
# Variance inflation factors for carat + clarity contrasts.
xkablevif(modelcaratclarity)
| carat | clarity.C | clarity.L | clarity.Q | clarity^4 | clarity^5 | clarity^6 | clarity^7 |
|---|---|---|---|---|---|---|---|
| 1.16 | 2.36 | 1.87 | 2.24 | 1.92 | 1.48 | 1.28 | 1.12 |
This model is the best
# Carat plus the ordered factor color.
modelcaratcolor <- lm(price ~ carat+color, data = diamonds)
tab_model(modelcaratcolor)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2702.23 | -2729.25 – -2675.22 | <0.001 |
| carat | 8066.62 | 8039.11 – 8094.14 | <0.001 |
| color [linear] | 1572.20 | 1528.46 – 1615.94 | <0.001 |
| color [quadratic] | -741.14 | -781.13 – -701.16 | <0.001 |
| color [cubic] | 122.70 | 85.17 – 160.22 | <0.001 |
| color [4th degree] | 78.77 | 44.30 – 113.23 | <0.001 |
| color [5th degree] | 144.74 | 112.16 – 177.32 | <0.001 |
| color [6th degree] | -180.75 | -210.30 – -151.19 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.864 / 0.864 | ||
# Variance inflation factors for carat + color contrasts.
xkablevif(modelcaratcolor)
| carat | color.C | color.L | color.Q | color^4 | color^5 | color^6 |
|---|---|---|---|---|---|---|
| 1.1 | 1.28 | 1.22 | 1.2 | 1.16 | 1.06 | 1.03 |
# Carat plus all three quality factors (R^2 = 0.916 below).
modelcaratccc <- lm(price ~ carat+color+cut+clarity, data = diamonds)
tab_model(modelcaratccc)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -3710.60 | -3738.01 – -3683.20 | <0.001 |
| carat | 8886.13 | 8862.54 – 8909.72 | <0.001 |
| color [linear] | 1910.29 | 1875.57 – 1945.00 | <0.001 |
| color [quadratic] | -627.95 | -659.55 – -596.36 | <0.001 |
| color [cubic] | 171.96 | 142.42 – 201.50 | <0.001 |
| color [4th degree] | 21.68 | -5.45 – 48.80 | 0.117 |
| color [5th degree] | 85.94 | 60.31 – 111.57 | <0.001 |
| color [6th degree] | -49.99 | -73.29 – -26.68 | <0.001 |
| cut [linear] | 698.91 | 659.05 – 738.76 | <0.001 |
| cut [quadratic] | -327.69 | -362.79 – -292.58 | <0.001 |
| cut [cubic] | 180.57 | 150.07 – 211.06 | <0.001 |
| cut [4th degree] | -1.21 | -25.63 – 23.21 | 0.923 |
| clarity [linear] | 4217.53 | 4157.11 – 4277.96 | <0.001 |
| clarity [quadratic] | -1832.41 | -1888.91 – -1775.90 | <0.001 |
| clarity [cubic] | 923.27 | 874.90 – 971.64 | <0.001 |
| clarity [4th degree] | -361.99 | -400.68 – -323.31 | <0.001 |
| clarity [5th degree] | 216.62 | 185.04 – 248.19 | <0.001 |
| clarity [6th degree] | 2.11 | -25.41 – 29.62 | 0.881 |
| clarity [7th degree] | 110.34 | 86.07 – 134.61 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.916 / 0.916 | ||
# Variance inflation factors for the carat + color + cut + clarity model.
xkablevif(modelcaratccc)
| carat | clarity.C | clarity.L | clarity.Q | clarity^4 | clarity^5 | clarity^6 | clarity^7 | color.C | color.L | color.Q | color^4 | color^5 | color^6 | cut.C | cut.L | cut.Q | cut^4 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.31 | 1.31 | 1.23 | 1.21 | 1.17 | 1.06 | 1.04 | 2.08 | 2.73 | 1.77 | 1.24 | 2.48 | 1.91 | 2.27 | 1.94 | 1.48 | 1.28 | 1.12 |
Now this model is the best
# Predictions from the carat + color + cut + clarity model (reuses the same
# holder variable as the earlier carat-only prediction plot).
df.with.predfin.from.model <- add_predictions(diamonds, modelcaratccc)
head(df.with.predfin.from.model)
loadPkg("ggplot2")
# Observed vs predicted price, with a red reference line through the predictions.
ggplot(df.with.predfin.from.model, aes(price, pred)) +
  geom_point(aes(price, pred), alpha = 0.2) +
  geom_line(aes(pred), colour = "red", size = 1) +
  labs(title = "Carat + Color + Cut + Clarity Linear Model",
       x = "Price", y = "Prediction") +
  xlim(c(0, 20000))
almost looks like a polynomial curve?
# NOTE(review): despite the "ccc" in the name, this model only interacts
# carat with clarity — color and cut are not included.
modelcaratccci <- lm(price ~ carat * clarity, data = diamonds)
tab_model(modelcaratccci)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2703.99 | -2736.96 – -2671.03 | <0.001 |
| carat | 8746.14 | 8709.34 – 8782.95 | <0.001 |
| clarity [linear] | -531.37 | -658.86 – -403.88 | <0.001 |
| clarity [quadratic] | 470.47 | 345.59 – 595.36 | <0.001 |
| clarity [cubic] | -926.21 | -1032.99 – -819.43 | <0.001 |
| clarity [4th degree] | 693.74 | 609.23 – 778.25 | <0.001 |
| clarity [5th degree] | -514.17 | -581.92 – -446.41 | <0.001 |
| clarity [6th degree] | 166.53 | 108.70 – 224.36 | <0.001 |
| clarity [7th degree] | -24.58 | -74.87 – 25.72 | 0.338 |
| carat * clarity [linear] | 5496.25 | 5361.14 – 5631.37 | <0.001 |
| carat * clarity [quadratic] | -1037.76 | -1165.70 – -909.82 | <0.001 |
| carat * clarity [cubic] | 1469.89 | 1354.08 – 1585.71 | <0.001 |
| carat * clarity [4th degree] | -943.92 | -1045.34 – -842.51 | <0.001 |
| carat * clarity [5th degree] | 674.63 | 586.02 – 763.23 | <0.001 |
| carat * clarity [6th degree] | -30.09 | -106.66 – 46.48 | 0.441 |
| carat * clarity [7th degree] | 304.46 | 242.54 – 366.39 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.909 / 0.909 | ||
# VIFs for the carat * clarity interaction model (interaction terms inflate VIF).
xkablevif(modelcaratccci)
| carat | carat:clarity.C | carat:clarity.L | carat:clarity.Q | carat:clarity^4 | carat:clarity^5 | carat:clarity^6 | carat:clarity^7 | clarity.C | clarity.L | clarity.Q | clarity^4 | clarity^5 | clarity^6 | clarity^7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2.94 | 10.1 | 8.59 | 10.2 | 8.52 | 6.28 | 5.22 | 4.44 | 10.2 | 8.21 | 10 | 11.6 | 9.9 | 7.65 | 5.19 |
pretty good model
# Carat plus all pairwise interactions among the three quality factors:
# (color+cut+clarity)^2 expands to main effects plus color:cut, color:clarity, cut:clarity.
modelcaratccci2 <- lm(price ~ carat + (color+cut+clarity)^2, data = diamonds)
tab_model(modelcaratccci2)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -3638.97 | -3676.26 – -3601.68 | <0.001 |
| carat | 8881.10 | 8857.62 – 8904.58 | <0.001 |
| color [linear] | 2336.13 | 2268.01 – 2404.24 | <0.001 |
| color [quadratic] | -388.93 | -451.66 – -326.21 | <0.001 |
| color [cubic] | 178.61 | 121.20 – 236.02 | <0.001 |
| color [4th degree] | 52.29 | 0.17 – 104.40 | 0.049 |
| color [5th degree] | 157.27 | 110.62 – 203.91 | <0.001 |
| color [6th degree] | -25.23 | -66.07 – 15.62 | 0.226 |
| cut [linear] | 746.14 | 659.74 – 832.54 | <0.001 |
| cut [quadratic] | -345.13 | -421.05 – -269.21 | <0.001 |
| cut [cubic] | 212.89 | 153.99 – 271.79 | <0.001 |
| cut [4th degree] | -1.07 | -46.51 – 44.36 | 0.963 |
| clarity [linear] | 4479.14 | 4360.03 – 4598.25 | <0.001 |
| clarity [quadratic] | -1635.44 | -1745.23 – -1525.66 | <0.001 |
| clarity [cubic] | 1013.53 | 914.76 – 1112.29 | <0.001 |
| clarity [4th degree] | -369.33 | -457.75 – -280.91 | <0.001 |
| clarity [5th degree] | 173.53 | 98.12 – 248.95 | <0.001 |
| clarity [6th degree] | -6.35 | -66.48 – 53.78 | 0.836 |
| clarity [7th degree] | 92.66 | 47.40 – 137.92 | <0.001 |
| color [linear] * cut [linear] | -156.52 | -285.46 – -27.59 | 0.017 |
| color [quadratic] * cut [linear] | -6.35 | -126.94 – 114.23 | 0.918 |
| color [cubic] * cut [linear] | 191.17 | 76.29 – 306.04 | 0.001 |
| color [4th degree] * cut [linear] | 115.55 | 5.20 – 225.90 | 0.040 |
| color [5th degree] * cut [linear] | 4.62 | -96.38 – 105.62 | 0.929 |
| color [6th degree] * cut [linear] | -44.86 | -137.62 – 47.90 | 0.343 |
| color [linear] * cut [quadratic] | 17.55 | -96.35 – 131.44 | 0.763 |
| color [quadratic] * cut [quadratic] | -10.65 | -117.32 – 96.03 | 0.845 |
| color [cubic] * cut [quadratic] | -117.92 | -219.52 – -16.32 | 0.023 |
| color [4th degree] * cut [quadratic] | -12.06 | -109.33 – 85.21 | 0.808 |
| color [5th degree] * cut [quadratic] | 100.25 | 10.85 – 189.65 | 0.028 |
| color [6th degree] * cut [quadratic] | -3.23 | -85.19 – 78.72 | 0.938 |
| color [linear] * cut [cubic] | -315.71 | -413.50 – -217.92 | <0.001 |
| color [quadratic] * cut [cubic] | 46.00 | -46.63 – 138.63 | 0.330 |
| color [cubic] * cut [cubic] | 127.27 | 39.38 – 215.17 | 0.005 |
| color [4th degree] * cut [cubic] | 33.14 | -50.13 – 116.42 | 0.435 |
| color [5th degree] * cut [cubic] | -139.38 | -217.23 – -61.53 | <0.001 |
| color [6th degree] * cut [cubic] | 70.04 | -2.21 – 142.29 | 0.057 |
| color [linear] * cut [4th degree] | -132.75 | -211.30 – -54.20 | 0.001 |
| color [quadratic] * cut [4th degree] | 14.70 | -60.42 – 89.82 | 0.701 |
| color [cubic] * cut [4th degree] | -68.91 | -139.77 – 1.95 | 0.057 |
| color [4th degree] * cut [4th degree] | 25.37 | -40.91 – 91.66 | 0.453 |
| color [5th degree] * cut [4th degree] | 82.61 | 19.92 – 145.31 | 0.010 |
| color [6th degree] * cut [4th degree] | 9.26 | -49.18 – 67.70 | 0.756 |
| color [linear] * clarity [linear] | 380.39 | 147.05 – 613.72 | 0.001 |
| color [quadratic] * clarity [linear] | 1324.92 | 1107.69 – 1542.15 | <0.001 |
| color [cubic] * clarity [linear] | 428.25 | 233.14 – 623.36 | <0.001 |
| color [4th degree] * clarity [linear] | 515.76 | 342.37 – 689.14 | <0.001 |
| color [5th degree] * clarity [linear] | -100.04 | -251.31 – 51.23 | 0.195 |
| color [6th degree] * clarity [linear] | 142.79 | 11.31 – 274.27 | 0.033 |
| color [linear] * clarity [quadratic] | 1718.88 | 1495.84 – 1941.92 | <0.001 |
| color [quadratic] * clarity [quadratic] | 1135.83 | 928.47 – 1343.18 | <0.001 |
| color [cubic] * clarity [quadratic] | 350.58 | 163.45 – 537.70 | <0.001 |
| color [4th degree] * clarity [quadratic] | 168.49 | 1.28 – 335.69 | 0.048 |
| color [5th degree] * clarity [quadratic] | 351.43 | 205.15 – 497.71 | <0.001 |
| color [6th degree] * clarity [quadratic] | 59.64 | -66.93 – 186.21 | 0.356 |
| color [linear] * clarity [cubic] | 88.87 | -105.48 – 283.22 | 0.370 |
| color [quadratic] * clarity [cubic] | 1046.58 | 865.45 – 1227.72 | <0.001 |
| color [cubic] * clarity [cubic] | 515.80 | 352.99 – 678.62 | <0.001 |
| color [4th degree] * clarity [cubic] | 242.40 | 98.06 – 386.74 | 0.001 |
| color [5th degree] * clarity [cubic] | -200.08 | -326.45 – -73.70 | 0.002 |
| color [6th degree] * clarity [cubic] | 100.17 | -9.02 – 209.37 | 0.072 |
| color [linear] * clarity [4th degree] | 777.96 | 620.19 – 935.73 | <0.001 |
| color [quadratic] * clarity [4th degree] | 206.65 | 58.48 – 354.81 | 0.006 |
| color [cubic] * clarity [4th degree] | 234.71 | 103.03 – 366.39 | <0.001 |
| color [4th degree] * clarity [4th degree] | 100.72 | -13.75 – 215.19 | 0.085 |
| color [5th degree] * clarity [4th degree] | 265.99 | 165.36 – 366.62 | <0.001 |
| color [6th degree] * clarity [4th degree] | -5.50 | -92.89 – 81.90 | 0.902 |
| color [linear] * clarity [5th degree] | 143.16 | 13.26 – 273.06 | 0.031 |
| color [quadratic] * clarity [5th degree] | 403.98 | 281.33 – 526.63 | <0.001 |
| color [cubic] * clarity [5th degree] | 158.89 | 50.22 – 267.57 | 0.004 |
| color [4th degree] * clarity [5th degree] | 153.65 | 60.42 – 246.88 | 0.001 |
| color [5th degree] * clarity [5th degree] | -85.84 | -168.85 – -2.84 | 0.043 |
| color [6th degree] * clarity [5th degree] | -16.20 | -88.41 – 56.01 | 0.660 |
| color [linear] * clarity [6th degree] | 299.99 | 189.98 – 410.00 | <0.001 |
| color [quadratic] * clarity [6th degree] | 111.82 | 7.88 – 215.77 | 0.035 |
| color [cubic] * clarity [6th degree] | 274.00 | 180.41 – 367.59 | <0.001 |
| color [4th degree] * clarity [6th degree] | -58.39 | -139.50 – 22.72 | 0.158 |
| color [5th degree] * clarity [6th degree] | 2.08 | -71.90 – 76.06 | 0.956 |
| color [6th degree] * clarity [6th degree] | 17.90 | -46.48 – 82.27 | 0.586 |
| color [linear] * clarity [7th degree] | 309.75 | 223.47 – 396.04 | <0.001 |
| color [quadratic] * clarity [7th degree] | -57.62 | -139.29 – 24.04 | 0.167 |
| color [cubic] * clarity [7th degree] | 24.45 | -51.53 – 100.43 | 0.528 |
| color [4th degree] * clarity [7th degree] | 40.22 | -27.54 – 107.98 | 0.245 |
| color [5th degree] * clarity [7th degree] | 75.03 | 10.60 – 139.47 | 0.022 |
| color [6th degree] * clarity [7th degree] | 32.36 | -24.66 – 89.38 | 0.266 |
| cut [linear] * clarity [linear] | 96.57 | -221.04 – 414.17 | 0.551 |
| cut [quadratic] * clarity [linear] | -683.67 | -964.58 – -402.77 | <0.001 |
| cut [cubic] * clarity [linear] | -241.08 | -456.94 – -25.22 | 0.029 |
| cut [4th degree] * clarity [linear] | -124.53 | -295.30 – 46.23 | 0.153 |
| cut [linear] * clarity [quadratic] | 424.52 | 132.64 – 716.40 | 0.004 |
| cut [quadratic] * clarity [quadratic] | -381.73 | -641.46 – -122.00 | 0.004 |
| cut [cubic] * clarity [quadratic] | 406.38 | 202.99 – 609.77 | <0.001 |
| cut [4th degree] * clarity [quadratic] | -27.27 | -192.97 – 138.42 | 0.747 |
| cut [linear] * clarity [cubic] | 434.99 | 169.75 – 700.22 | 0.001 |
| cut [quadratic] * clarity [cubic] | -591.53 | -825.89 – -357.17 | <0.001 |
| cut [cubic] * clarity [cubic] | 253.30 | 71.68 – 434.93 | 0.006 |
| cut [4th degree] * clarity [cubic] | -24.85 | -167.49 – 117.79 | 0.733 |
| cut [linear] * clarity [4th degree] | 349.97 | 104.55 – 595.39 | 0.005 |
| cut [quadratic] * clarity [4th degree] | -269.02 | -482.67 – -55.36 | 0.014 |
| cut [cubic] * clarity [4th degree] | 191.84 | 32.48 – 351.19 | 0.018 |
| cut [4th degree] * clarity [4th degree] | -116.54 | -231.37 – -1.72 | 0.047 |
| cut [linear] * clarity [5th degree] | 351.69 | 140.95 – 562.44 | 0.001 |
| cut [quadratic] * clarity [5th degree] | -367.28 | -549.40 – -185.16 | <0.001 |
| cut [cubic] * clarity [5th degree] | 198.17 | 62.73 – 333.62 | 0.004 |
| cut [4th degree] * clarity [5th degree] | -32.54 | -125.13 – 60.05 | 0.491 |
| cut [linear] * clarity [6th degree] | 55.85 | -108.32 – 220.01 | 0.505 |
| cut [quadratic] * clarity [6th degree] | -101.12 | -243.18 – 40.93 | 0.163 |
| cut [cubic] * clarity [6th degree] | 124.59 | 13.87 – 235.31 | 0.027 |
| cut [4th degree] * clarity [6th degree] | -9.78 | -87.16 – 67.61 | 0.804 |
| cut [linear] * clarity [7th degree] | 16.62 | -104.82 – 138.05 | 0.789 |
| cut [quadratic] * clarity [7th degree] | 25.90 | -80.17 – 131.98 | 0.632 |
| cut [cubic] * clarity [7th degree] | -36.93 | -123.74 – 49.87 | 0.404 |
| cut [4th degree] * clarity [7th degree] | 66.55 | 2.05 – 131.06 | 0.043 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.918 / 0.918 | ||
# VIFs for the full pairwise-interaction model (several exceed 10 — see note below).
xkablevif(modelcaratccci2)
| carat | clarity.C | clarity.L | clarity.Q | clarity^4 | clarity^5 | clarity^6 | clarity^7 | color.C | color.C:clarity.C | color.C:clarity.L | color.C:clarity.Q | color.C:clarity^4 | color.C:clarity^5 | color.C:clarity^6 | color.C:clarity^7 | color.C:cut.C | color.C:cut.L | color.C:cut.Q | color.C:cut^4 | color.L | color.L:clarity.C | color.L:clarity.L | color.L:clarity.Q | color.L:clarity^4 | color.L:clarity^5 | color.L:clarity^6 | color.L:clarity^7 | color.L:cut.C | color.L:cut.L | color.L:cut.Q | color.L:cut^4 | color.Q | color.Q:clarity.C | color.Q:clarity.L | color.Q:clarity.Q | color.Q:clarity^4 | color.Q:clarity^5 | color.Q:clarity^6 | color.Q:clarity^7 | color.Q:cut.C | color.Q:cut.L | color.Q:cut.Q | color.Q:cut^4 | color^4 | color^4:clarity.C | color^4:clarity.L | color^4:clarity.Q | color4:clarity4 | color4:clarity5 | color4:clarity6 | color4:clarity7 | color^4:cut.C | color^4:cut.L | color^4:cut.Q | color4:cut4 | color^5 | color^5:clarity.C | color^5:clarity.L | color^5:clarity.Q | color5:clarity4 | color5:clarity5 | color5:clarity6 | color5:clarity7 | color^5:cut.C | color^5:cut.L | color^5:cut.Q | color5:cut4 | color^6 | color^6:clarity.C | color^6:clarity.L | color^6:clarity.Q | color6:clarity4 | color6:clarity5 | color6:clarity6 | color6:clarity7 | color^6:cut.C | color^6:cut.L | color^6:cut.Q | color6:cut4 | cut.C | cut.C:clarity.C | cut.C:clarity.L | cut.C:clarity.Q | cut.C:clarity^4 | cut.C:clarity^5 | cut.C:clarity^6 | cut.C:clarity^7 | cut.L | cut.L:clarity.C | cut.L:clarity.L | cut.L:clarity.Q | cut.L:clarity^4 | cut.L:clarity^5 | cut.L:clarity^6 | cut.L:clarity^7 | cut.Q | cut.Q:clarity.C | cut.Q:clarity.L | cut.Q:clarity.Q | cut.Q:clarity^4 | cut.Q:clarity^5 | cut.Q:clarity^6 | cut.Q:clarity^7 | cut^4 | cut^4:clarity.C | cut^4:clarity.L | cut^4:clarity.Q | cut4:clarity4 | cut4:clarity5 | cut4:clarity6 | cut4:clarity7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.34 | 5.18 | 5 | 4.68 | 4.43 | 3.62 | 3.27 | 10 | 13.1 | 6.79 | 4.42 | 9.89 | 7.42 | 9.76 | 10.4 | 8.7 | 6.31 | 4.02 | 3.87 | 4.02 | 3.82 | 4.12 | 3.46 | 3.56 | 3.22 | 3.32 | 3.1 | 3.28 | 2.81 | 2.8 | 2.04 | 2.17 | 2 | 2.04 | 1.84 | 1.87 | 1.49 | 1.55 | 1.46 | 1.44 | 1.33 | 1.31 | 4.13 | 4.5 | 3.69 | 3.33 | 2.69 | 2.48 | 4.91 | 5.27 | 4.34 | 4.12 | 3.14 | 2.94 | 4.69 | 4.84 | 4.03 | 3.58 | 2.86 | 2.5 | 3.57 | 3.73 | 3.05 | 2.65 | 2.1 | 1.88 | 2.85 | 2.95 | 2.41 | 2.01 | 1.65 | 1.44 | 2.38 | 2.42 | 2.05 | 1.73 | 1.48 | 1.28 | 1.62 | 1.69 | 1.49 | 1.39 | 1.23 | 1.15 | 16.1 | 12.4 | 6.23 | 4.22 | 15.3 | 14 | 7.08 | 4.99 | 16.2 | 13.3 | 6.82 | 4.74 | 16.5 | 12.7 | 6.28 | 3.52 | 13.7 | 10.7 | 5.19 | 2.69 | 9.43 | 7.42 | 3.88 | 2.17 | 6.02 | 4.68 | 2.67 | 1.62 |
high VIF values
# NOTE(review): (carat + clarity)^2 expands to carat + clarity + carat:clarity,
# which is the same formula as modelcaratccci's carat * clarity — the coefficient
# table below is identical to that model's, so this fit is a duplicate.
modelcaratccci3 <- lm(price ~ (carat + clarity)^2, data = diamonds)
tab_model(modelcaratccci3)
| price | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2703.99 | -2736.96 – -2671.03 | <0.001 |
| carat | 8746.14 | 8709.34 – 8782.95 | <0.001 |
| clarity [linear] | -531.37 | -658.86 – -403.88 | <0.001 |
| clarity [quadratic] | 470.47 | 345.59 – 595.36 | <0.001 |
| clarity [cubic] | -926.21 | -1032.99 – -819.43 | <0.001 |
| clarity [4th degree] | 693.74 | 609.23 – 778.25 | <0.001 |
| clarity [5th degree] | -514.17 | -581.92 – -446.41 | <0.001 |
| clarity [6th degree] | 166.53 | 108.70 – 224.36 | <0.001 |
| clarity [7th degree] | -24.58 | -74.87 – 25.72 | 0.338 |
| carat * clarity [linear] | 5496.25 | 5361.14 – 5631.37 | <0.001 |
| carat * clarity [quadratic] | -1037.76 | -1165.70 – -909.82 | <0.001 |
| carat * clarity [cubic] | 1469.89 | 1354.08 – 1585.71 | <0.001 |
| carat * clarity [4th degree] | -943.92 | -1045.34 – -842.51 | <0.001 |
| carat * clarity [5th degree] | 674.63 | 586.02 – 763.23 | <0.001 |
| carat * clarity [6th degree] | -30.09 | -106.66 – 46.48 | 0.441 |
| carat * clarity [7th degree] | 304.46 | 242.54 – 366.39 | <0.001 |
| Observations | 53940 | ||
| R2 / R2 adjusted | 0.909 / 0.909 | ||
# VIFs for (carat + clarity)^2 — identical to modelcaratccci's (same formula).
xkablevif(modelcaratccci3)
| carat | carat:clarity.C | carat:clarity.L | carat:clarity.Q | carat:clarity^4 | carat:clarity^5 | carat:clarity^6 | carat:clarity^7 | clarity.C | clarity.L | clarity.Q | clarity^4 | clarity^5 | clarity^6 | clarity^7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2.94 | 10.1 | 8.59 | 10.2 | 8.52 | 6.28 | 5.22 | 4.44 | 10.2 | 8.21 | 10 | 11.6 | 9.9 | 7.65 | 5.19 |
pretty great again, but high VIF
Nothing with interaction terms beats the carat + ccc model
loadPkg("leaps")
# Exhaustive best-subset search over all predictors, up to 10 variables.
reg.best10 <- regsubsets(price~. , data = diamonds, nvmax = 10, nbest = 1, method = "exhaustive") # leaps::regsubsets() - Model selection by exhaustive (default) search, forward or backward stepwise, or sequential replacement
# The plot shows adjusted R^2 for each subset, with the selected variables across the bottom.
plot(reg.best10, scale = "adjr2", main = "Exhaustive Adjusted R^2")
all types of models look the same, so just stick to adjusted R squared
# Forward stepwise selection, same settings as the exhaustive search above
reg.forward10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "forward")
plot(reg.forward10, scale = "adjr2", main = "Forward Adjusted R^2")
# summary(reg.forward10)
again all the same
# Backward stepwise selection. Renamed from reg.forward10 — storing a
# backward-search result in a variable named "forward" was misleading.
reg.backward10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "backward")
plot(reg.backward10, scale = "adjr2", main = "Backward Adjusted R^2")
# summary(reg.backward10)
same
# Sequential-replacement selection. Renamed from reg.forward10 — storing a
# seqrep-search result in a variable named "forward" was misleading.
reg.seqrep10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "seqrep")
plot(reg.seqrep10, scale = "adjr2", main = "Sequential Replacement Adjusted R^2")
# summary(reg.seqrep10)
same
All feature-selection methods reflect our earlier findings, and they all produce essentially the same result.
We first subset the continuous variables from the dataframe to a new dataframe.
diamondscont <- subset(diamonds, select = c ("carat", "depth", "table", "price", "x", "y", "z"))
This data is not scaled and hence we have to standardize the data.
# Standardize every column to mean 0, sd 1; scale() returns a matrix,
# so coerce back to a data frame
diamondscont <- as.data.frame(scale(diamondscont))
head(diamondscont)
## carat depth table price x y z
## 1 -1.20 -0.174 -1.100 -0.904 -1.59 -1.54 -1.57
## 2 -1.24 -1.361 1.586 -0.904 -1.64 -1.66 -1.74
## 3 -1.20 -3.385 3.376 -0.904 -1.50 -1.46 -1.74
## 4 -1.07 0.454 0.243 -0.902 -1.36 -1.32 -1.29
## 5 -1.03 1.082 0.243 -0.902 -1.24 -1.21 -1.12
## 6 -1.18 0.733 -0.205 -0.902 -1.60 -1.55 -1.50
Running the PCR model using pls library :
library(pls)
# Principal-components regression of price on the other standardized
# continuous variables, with cross-validation (10 random segments per the
# summary output below)
pcr_model <- pcr(price~., data = diamondscont,ncomp=6, validation = "CV")
summary(pcr_model)
## Data: X dimension: 53940 6
## Y dimension: 53940 1
## Fit method: svdpc
## Number of components considered: 6
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1 0.452 0.4497 0.4417 0.3887 0.3858 0.3759
## adjCV 1 0.452 0.4497 0.4417 0.3837 0.3863 0.3758
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 65.54 86.94 98.33 99.12 99.78 100.00
## price 79.57 79.77 80.49 85.01 85.21 85.92
# Cross-validated RMSEP vs. number of components for the PCR fit
validationplot(pcr_model)
# PCA on the same data; the columns were already standardized above, so
# scale = TRUE re-divides by sd = 1 and is harmless
pcrcomp <- prcomp(diamondscont, scale = TRUE)
summary(pcrcomp)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.183 1.134 0.8312 0.4168 0.20077 0.18151 0.11135
## Proportion of Variance 0.681 0.184 0.0987 0.0248 0.00576 0.00471 0.00177
## Cumulative Proportion 0.681 0.864 0.9629 0.9878 0.99352 0.99823 1.00000
pcrcomp$rotation
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## carat 0.452445 -0.03470 0.00549 -0.0684 0.1340 0.7682 0.42588
## depth -0.000916 -0.73068 -0.67283 -0.0472 -0.0887 0.0145 -0.05560
## table 0.099516 0.67507 -0.72807 -0.0595 -0.0104 -0.0253 -0.00205
## price 0.425519 -0.03526 0.10545 -0.8498 -0.0538 -0.2733 -0.08281
## x 0.453213 0.00351 0.03951 0.2430 0.0890 0.1985 -0.82866
## y 0.447265 0.00216 0.05419 0.3285 -0.7741 -0.2153 0.20886
## z 0.445954 -0.08904 -0.03960 0.3170 0.6034 -0.4987 0.27996
We can see from the results of the PCR model that with just one component we can explain around 79 % of variation in price and as the components increase, the variation in price explained increases very marginally so it would make sense to build a model with just one variable.
We will try to build a linear model with one variable from PC1. x, y, z and carat have the highest loadings on PC1.
# Simple regression of standardized price on x alone
linear_model_pcr_1a <- lm(price ~ x, data = diamondscont)
summary(linear_model_pcr_1a)
##
## Call:
## lm(formula = price ~ x, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.112 -0.317 -0.046 0.244 8.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24e-14 2.01e-03 0 1
## x 8.84e-01 2.01e-03 440 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.467 on 53938 degrees of freedom
## Multiple R-squared: 0.782, Adjusted R-squared: 0.782
## F-statistic: 1.94e+05 on 1 and 53938 DF, p-value: <2e-16
# Simple regression of standardized price on y alone
linear_model_pcr_1b <- lm(price ~ y, data = diamondscont)
summary(linear_model_pcr_1b)
##
## Call:
## lm(formula = price ~ y, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.21 -0.31 -0.06 0.21 7.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.52e-15 2.16e-03 0 1
## y 8.65e-01 2.16e-03 401 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.501 on 53938 degrees of freedom
## Multiple R-squared: 0.749, Adjusted R-squared: 0.749
## F-statistic: 1.61e+05 on 1 and 53938 DF, p-value: <2e-16
# Simple regression of standardized price on z alone
linear_model_pcr_1c <- lm(price ~ z, data = diamondscont)
summary(linear_model_pcr_1c)
##
## Call:
## lm(formula = price ~ z, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.98 -0.31 -0.06 0.21 8.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.56e-15 2.19e-03 0 1
## z 8.61e-01 2.19e-03 394 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.508 on 53938 degrees of freedom
## Multiple R-squared: 0.742, Adjusted R-squared: 0.742
## F-statistic: 1.55e+05 on 1 and 53938 DF, p-value: <2e-16
# Simple regression of standardized price on depth alone
linear_model_pcr_1d <- lm(price ~ depth, data = diamondscont)
summary(linear_model_pcr_1d)
##
## Call:
## lm(formula = price ~ depth, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.944 -0.748 -0.381 0.350 3.744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.73e-15 4.31e-03 0.00 1.000
## depth -1.06e-02 4.31e-03 -2.47 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1 on 53938 degrees of freedom
## Multiple R-squared: 0.000113, Adjusted R-squared: 9.48e-05
## F-statistic: 6.12 on 1 and 53938 DF, p-value: 0.0134
# Simple regression of standardized price on table alone
linear_model_pcr_1e <- lm(price ~ table, data = diamondscont)
summary(linear_model_pcr_1e)
##
## Call:
## lm(formula = price ~ table, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.635 -0.690 -0.374 0.343 3.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.04e-14 4.27e-03 0.0 1
## table 1.27e-01 4.27e-03 29.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.992 on 53938 degrees of freedom
## Multiple R-squared: 0.0162, Adjusted R-squared: 0.0161
## F-statistic: 886 on 1 and 53938 DF, p-value: <2e-16
# Simple regression of standardized price on carat alone
linear_model_pcr_1f <- lm(price ~ carat, data = diamondscont)
summary(linear_model_pcr_1f)
##
## Call:
## lm(formula = price ~ carat, data = diamondscont)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.659 -0.202 -0.005 0.135 3.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.36e-14 1.67e-03 0 1
## carat 9.22e-01 1.67e-03 551 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.388 on 53938 degrees of freedom
## Multiple R-squared: 0.849, Adjusted R-squared: 0.849
## F-statistic: 3.04e+05 on 1 and 53938 DF, p-value: <2e-16
We can see from the above models that if we have to choose one variable to best explain the variation in price, it has to be the carat of the diamond, because it has the highest adjusted R-squared value among all the variables.
If we try to build a model with more than one variable as above, we can see that the adjusted r squared value of the model increases very slightly, which means that the variation in price is explained only a little better than the model with just one variable. If we see the vif values which checks the collinearity of the variable, we can see that all the continuous variables are highly collinear.
In this section we will be using the Random Forest model to build a classification model to be able to effectively classify diamonds into ‘Fair’, ‘Good’, ‘Very Good’, ‘Premium’, ‘Ideal’ diamond cuts, using the base features.
Prior to Random Forest Classification, Multi-nomial Logistic regression had been used but a very low accuracy score was found.
We start by removing missing values and outliers to make a good classification.
# Work on a copy so the original diamonds frame is untouched
mydiamonds <- diamonds
# Recode "?" placeholders as proper missing values, then count NAs per column
mydiamonds[mydiamonds == "?"] <- NA
colSums(is.na(mydiamonds))
# str(mydiamonds)
# boxplot(mydiamonds$price)
# any(is.na.data.frame(mydiamonds_clean))
# Drop rows whose price is flagged as an outlier by the boxplot rule
price_outliers <- boxplot.stats(mydiamonds$price)$out
mydiamonds_clean <- subset(mydiamonds, !mydiamonds$price %in% price_outliers)
Upon removing outliers, 3538 outliers were detected and removed.
Next, we split our data set into training and testing set with 80% and 20% respectively.
library(randomForest)
require(caTools)   # NOTE(review): prefer library() — require() returns FALSE instead of erroring
library(MASS)
library(caret)
# Fix the seed so the train/test partition is reproducible
set.seed(1)
# Drop the ID column (col 1); keep carat..z (cols 2-11)
dataset= mydiamonds_clean[,2:11]
# Stratified partition on cut: ~20% of row indices go to the test set
test= createDataPartition(dataset$cut,p=.2, list= FALSE)
data_train= dataset[-test,]
data_test= dataset[test,]
Before we dive into Random Forest classification, let's perform a multinomial logistic regression and take a look at the accuracy score.
require(nnet)
# Training the multinomial model.
# BUG FIX: the model was previously fit on data_test and then evaluated on the
# same rows (train/test leakage); fit on the training partition instead.
multinom_model <- multinom(cut ~ ., data = data_train)
# Checking the model
summary(multinom_model)
# Predicting cut for the held-out test set
classPredicted <- predict(multinom_model, newdata = data_test, "class")
# Confusion matrix
library(caret)
# confusionMatrix(data, reference): predictions first, true labels second.
# The arguments were previously swapped, which transposes the per-class
# sensitivity/specificity statistics in byClass.
cmlogit=confusionMatrix(classPredicted, data_test$cut)
Logistic Regression has an accuracy score of 65.01%.
With logistic regression performing poorly, we will test Random Forest classification.
First, we tune the Random Forest Model to get the best mtry (number of variables sampled at each split).
# Tuning Random Forest to get the best mtry (number of variables random sampled as candidates at each split)
# data_train[,-2] = predictors (cut is column 2 of dataset); data_train[,2] = response.
# stepFactor halves/doubles mtry each step; search stops when OOB error
# improves by less than `improve`.
tune= tuneRF(data_train[,-2],data_train[,2], stepFactor = 0.5, plot=TRUE,
ntreeTry=75,trace=TRUE, improve=0.05)
## mtry = 3 OOB error = 21.46%
## Searching left ...
## mtry = 6 OOB error = 21.34%
## 0.00555 0.05
## Searching right ...
## mtry = 1 OOB error = 25.41%
## -0.184 0.05
Note: mtry is the number of predictors sampled for splitting at each node.
The chart above helps us choose the best mtry to get a minimal Out of Bag Error. Obviously, 6 is the best mtry with less than 22% OOB error.
rf<- randomForest(cut~ .,data=data_train,mtry=6, ntree=75) #fitting RandomForest Classification
pred = predict(rf, newdata=data_test[,-2]) # Prediction on test set (cut is column 2)
# confusionMatrix(data, reference): predictions first, true labels second.
# Previously the arguments were swapped (truth passed as `data`), which
# transposes the table and the per-class byClass statistics.
cm=confusionMatrix(pred, data_test$cut) #confusion matrix
print(rf)
##
## Call:
## randomForest(formula = cut ~ ., data = data_train, mtry = 6, ntree = 75)
## Type of random forest: classification
## Number of trees: 75
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 21.4%
## Confusion matrix:
## Fair Good Very Good Premium Ideal class.error
## Fair 1055 93 21 29 9 0.126
## Good 104 2637 796 125 50 0.290
## Very Good 10 575 5199 1347 1929 0.426
## Premium 0 43 954 7936 1093 0.208
## Ideal 5 31 816 584 14879 0.088
The random forest classifier has been modeled with mtry = 6 and ntree = 75.
xkabledply(cm$table,title="Confusion Matrix")
| Fair | Good | Very Good | Premium | Ideal | |
|---|---|---|---|---|---|
| Fair | 272 | 18 | 2 | 8 | 2 |
| Good | 34 | 648 | 204 | 24 | 19 |
| Very Good | 3 | 138 | 1330 | 322 | 472 |
| Premium | 0 | 8 | 225 | 1978 | 296 |
| Ideal | 2 | 9 | 194 | 136 | 3738 |
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.90e-01 7.02e-01 7.82e-01 7.98e-01 4.49e-01
## AccuracyPValue McnemarPValue
## 0.00e+00 8.87e-44
The confusion matrix is seen above. Also, we get an accuracy score of 79.012% for the test dataset.
plot(rf)
The graph above shows that as the trees increases the Out of Bag Error reduces. While a larger size of trees greater than 75 does not reduce the out of bag error significantly.
round(importance(rf), 2)
## MeanDecreaseGini
## carat 1085
## color 1082
## clarity 986
## depth 6933
## table 7925
## price 3057
## x 3152
## y 3039
## z 1541
# Mean Decrease in Gini is the average (mean) of a variable's total decrease in node impurity
# Dot chart of the importance scores listed above
varImpPlot(rf)
The mean decrease in Gini coefficient is a measure of how each variable contributes to the homogeneity of the nodes and leaves in the resulting random forest. The higher the value of mean decrease Gini score, the higher the importance of the variable in the model.
In the above chart, table and depth have the highest importance in the Random Forest model.
set.seed(1)
# Drop ID (col 1) and clarity (the lowest mean-decrease-Gini feature);
# select the remaining features by name rather than magic column indices.
dataset2= mydiamonds_clean[,c("carat","cut","color","depth","table","price","x","y","z")]
test2= createDataPartition(dataset2$cut,p=.2, list= FALSE)
data_train2= dataset2[-test2,]
data_test2= dataset2[test2,]
rf2<- randomForest(cut~ .,data=data_train2,mtry=6, ntree=75) #fitting RandomForest Classification
pred2 = predict(rf2, newdata=data_test2[,-2]) # Prediction on test set (cut is column 2)
# Predictions first, true labels second (caret's documented argument order)
cm2=confusionMatrix(pred2, data_test2$cut) #confusion matrix
print(rf2)
Removing least mean decrease gini feature (clarity) increases the Out of Bag error by 0.1%. We will stick to the first random forest model for predicting diamond cut type with minimal out of bag error.
check= data.frame()
# c('accuracy','precision','sensitivity','specificity')
# Accuracy = sum of confusion-matrix diagonal / total, as a percentage
check=rbind(check,c(round((sum(diag(cmlogit$table))/sum(cmlogit$table))*100,2)))
check=rbind(check,c(round((sum(diag(cm$table))/sum(cm$table))*100,2)))
rownames(check)=c('logistic','random forest')
colnames(check)=c("Accuracy Score")
# Use byClass column names instead of positional slices (cm$byClass is a
# classes x statistics matrix; the old [1:5], [6:10], [21:25] indexing relied
# on caret's internal column layout and silently breaks if it changes).
# Also fixes the "AverageSensitiviy" column-name typo.
check$AverageSensitivity=c(mean(cmlogit$byClass[,"Sensitivity"])*100, mean(cm$byClass[,"Sensitivity"])*100)
check$AverageSpecificity=c(mean(cmlogit$byClass[,"Specificity"])*100, mean(cm$byClass[,"Specificity"])*100)
check$AveragePrecision=c(mean(cmlogit$byClass[,"Precision"])*100, mean(cm$byClass[,"Precision"])*100)
# Typo fix in the legend: "RandoForest" -> "Random Forest"
barplot(height=as.matrix(check),beside=TRUE,names.arg = c("Accuracy","Sensitivity","Specificity","Precision"),legend.text = c("Logistic","Random Forest"),col=c("blue","green"))
The barplot above gives us a pictorial view of how Random Forest surpasses Logistic regression for classifying diamond cut type.
We want to build some KNN models to help cutting workers to decide which cutting method should be used on which diamond.
### pick numeric columns only
# Standardized numeric predictors for KNN: carat, depth, table, x, y, z.
# price is excluded (unknown before cutting) and so are the factor columns.
knndata <- as.data.frame(scale(diamonds[, c("carat", "depth", "table", "x", "y", "z")], center = TRUE, scale = TRUE))
We use all numeric variables as independent variables except price, because cutting workers don’t know the sale price before cutting.
# Reproducible 75/25 train/test split: assign each row to group 1 or 2
set.seed(1000)
knn_sample <- sample(2, nrow(knndata), replace = TRUE, prob = c(0.75, 0.25))
knn_training <- knndata[knn_sample == 1, ]
knn_test <- knndata[knn_sample == 2, ]
# True cut labels for each split (cut is column 3 of diamonds)
trainLabels <- diamonds[knn_sample == 1, "cut"]
testLabels <- diamonds[knn_sample == 2, "cut"]
The whole data set has been split into a training set and a testing set in a 3:1 ratio.
loadPkg("FNN")
loadPkg("gmodels")
loadPkg("caret")
# k-nearest-neighbour prediction of cut on the test set, k = 7
knn_pred <- knn(train = knn_training, test = knn_test, cl=trainLabels, k=7)
# Cross-tabulate true labels vs predictions (counts plus row/col/table proportions)
KNNCross <- CrossTable(testLabels, knn_pred, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 13491
##
##
## | knn_pred
## testLabels | Fair | Good | Ideal | Premium | Very Good | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Fair | 305 | 63 | 8 | 14 | 6 | 396 |
## | 0.770 | 0.159 | 0.020 | 0.035 | 0.015 | 0.029 |
## | 0.852 | 0.055 | 0.001 | 0.004 | 0.003 | |
## | 0.023 | 0.005 | 0.001 | 0.001 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Good | 46 | 754 | 26 | 104 | 287 | 1217 |
## | 0.038 | 0.620 | 0.021 | 0.085 | 0.236 | 0.090 |
## | 0.128 | 0.660 | 0.004 | 0.027 | 0.143 | |
## | 0.003 | 0.056 | 0.002 | 0.008 | 0.021 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Very Good | 2 | 257 | 713 | 873 | 1175 | 3020 |
## | 0.001 | 0.085 | 0.236 | 0.289 | 0.389 | 0.224 |
## | 0.006 | 0.225 | 0.117 | 0.224 | 0.585 | |
## | 0.000 | 0.019 | 0.053 | 0.065 | 0.087 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Premium | 0 | 58 | 421 | 2669 | 331 | 3479 |
## | 0.000 | 0.017 | 0.121 | 0.767 | 0.095 | 0.258 |
## | 0.000 | 0.051 | 0.069 | 0.685 | 0.165 | |
## | 0.000 | 0.004 | 0.031 | 0.198 | 0.025 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Ideal | 5 | 11 | 4918 | 236 | 209 | 5379 |
## | 0.001 | 0.002 | 0.914 | 0.044 | 0.039 | 0.399 |
## | 0.014 | 0.010 | 0.808 | 0.061 | 0.104 | |
## | 0.000 | 0.001 | 0.365 | 0.017 | 0.015 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 358 | 1143 | 6086 | 3896 | 2008 | 13491 |
## | 0.027 | 0.085 | 0.451 | 0.289 | 0.149 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
KNNCross
## $t
## y
## x Fair Good Ideal Premium Very Good
## Fair 305 63 8 14 6
## Good 46 754 26 104 287
## Very Good 2 257 713 873 1175
## Premium 0 58 421 2669 331
## Ideal 5 11 4918 236 209
##
## $prop.row
## y
## x Fair Good Ideal Premium Very Good
## Fair 0.770202 0.159091 0.020202 0.035354 0.015152
## Good 0.037798 0.619556 0.021364 0.085456 0.235826
## Very Good 0.000662 0.085099 0.236093 0.289073 0.389073
## Premium 0.000000 0.016671 0.121012 0.767174 0.095142
## Ideal 0.000930 0.002045 0.914296 0.043874 0.038855
##
## $prop.col
## y
## x Fair Good Ideal Premium Very Good
## Fair 0.85196 0.05512 0.00131 0.00359 0.00299
## Good 0.12849 0.65967 0.00427 0.02669 0.14293
## Very Good 0.00559 0.22485 0.11715 0.22408 0.58516
## Premium 0.00000 0.05074 0.06918 0.68506 0.16484
## Ideal 0.01397 0.00962 0.80808 0.06057 0.10408
##
## $prop.tbl
## y
## x Fair Good Ideal Premium Very Good
## Fair 0.022608 0.004670 0.000593 0.001038 0.000445
## Good 0.003410 0.055889 0.001927 0.007709 0.021273
## Very Good 0.000148 0.019050 0.052850 0.064710 0.087095
## Premium 0.000000 0.004299 0.031206 0.197836 0.024535
## Ideal 0.000371 0.000815 0.364539 0.017493 0.015492
Here is the result of KNN model when K is equal to 7.
We also checked different K values.
ResultDf = data.frame( k=numeric(0), Total.Accuracy= numeric(0), row.names = NULL )
# Evaluate KNN for k = 3..15 and collect the total accuracy for each k.
# Loop-local objects use distinct names (knn_pred_k, cm_k, ...) so the loop
# no longer clobbers the k = 7 `knn_pred`/`KNNCross` and random-forest `cm`
# objects created earlier in the document.
for (kval in 3:15) {
  knn_pred_k <- knn(train = knn_training, test = knn_test, cl=trainLabels, k=kval)
  KNNCross_k <- CrossTable(testLabels, knn_pred_k, prop.chisq = FALSE)
  print( paste("k = ", kval) )
  KNNCross_k
  # get confusion matrix (predictions first, reference second)
  cm_k = confusionMatrix(knn_pred_k, reference = testLabels )
  # get accuracy score
  cmaccu = cm_k$overall['Accuracy']
  print( paste("Total Accuracy = ", cmaccu ) )
  # store into dataframe
  cmt = data.frame(k=kval, Total.Accuracy = cmaccu, row.names = NULL )
  ResultDf = rbind(ResultDf, cmt)
  print( xkabledply( as.matrix(cm_k), title = paste("ConfusionMatrix for k = ",kval ) ) )
  print( xkabledply(data.frame(cm_k$byClass), title=paste("k = ",kval)) )
}
xkabledply(ResultDf, "Total Accuracy Summary")
| k | Total.Accuracy |
|---|---|
| 3 | 0.704 |
| 4 | 0.722 |
| 5 | 0.723 |
| 6 | 0.728 |
| 7 | 0.728 |
| 8 | 0.731 |
| 9 | 0.730 |
| 10 | 0.732 |
| 11 | 0.732 |
| 12 | 0.733 |
| 13 | 0.732 |
| 14 | 0.733 |
| 15 | 0.733 |
Above is the accuracy scores of different K values.
# Sweep odd k values 3, 5, ..., 41 and record total accuracy for each
ResultDf2 <- data.frame(k = numeric(0), Total.Accuracy = numeric(0), row.names = NULL)
for (kval in 1:20) {
  k_odd <- 2 * kval + 1
  knn_pred <- knn(train = knn_training, test = knn_test, cl = trainLabels, k = k_odd)
  cm <- confusionMatrix(knn_pred, reference = testLabels)
  cmaccu <- cm$overall['Accuracy']
  cmt <- data.frame(k = k_odd, Total.Accuracy = cmaccu, row.names = NULL)
  ResultDf2 <- rbind(ResultDf2, cmt)
}
loadPkg("ggplot2")
# Accuracy as a function of k
ggplot(ResultDf2,
       aes(x = k, y = Total.Accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) +
  labs(title = "accuracy vs k") +
  theme_bw() + theme(panel.grid=element_blank())
From this plot we can see that from 3 to 17, the accuracy increases when K increases, but when K is bigger than 17, the accuracy begins to decrease.
So, the appropriate K range for the KNN model is from 11 to 17, because in this range we obtain consistently high accuracy values.